{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5a5fc7e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "# =============================================================================="
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07c1fc48",
   "metadata": {},
   "source": [
    "<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
    "\n",
    "# Creating Multi-Modal Movie Feature Store\n",
    "\n",
    "Finally, with both the text and image features ready, we now put the multi-modal movie features into a unified feature store.\n",
    "\n",
    "If you have downloaded the real data and proceeded through the feature extraction process in notebooks 03-05, then proceed to create the feature store. Otherwise, skip to the `Synthetic data` section below to create random features.\n",
    "\n",
    "## Real data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "05429d2e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "61947"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pickle\n",
    "\n",
    "# NOTE: pickle.load can execute arbitrary code -- only open pickles you produced yourself.\n",
    "with open('movies_poster_features.pkl', 'rb') as poster_file:\n",
    "    poster_payload = pickle.load(poster_file)\n",
    "\n",
    "# The poster feature vectors live under the \"feature_dict\" key of the pickled payload.\n",
    "poster_feature = poster_payload[\"feature_dict\"]\n",
    "len(poster_feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "88ecbfe2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The synopsis embeddings live under the \"embeddings\" key of the pickled payload.\n",
    "# NOTE: pickle.load can execute arbitrary code -- only open pickles you produced yourself.\n",
    "with open('movies_synopsis_embeddings-1024.pkl', 'rb') as synopsis_file:\n",
    "    synopsis_payload = pickle.load(synopsis_file)\n",
    "\n",
    "text_feature = synopsis_payload[\"embeddings\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "513eaaa1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "61291"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(text_feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "be6513cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read imdbId as text: the ids carry leading zeros that a numeric parse would drop.\n",
    "links = pd.read_csv(\n",
    "    \"./data/ml-25m/links.csv\",\n",
    "    dtype={\"imdbId\": str},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ee6f8995",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(62423, 3)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "links.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ac1a3ff5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>imdbId</th>\n",
       "      <th>tmdbId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0114709</td>\n",
       "      <td>862.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0113497</td>\n",
       "      <td>8844.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0113228</td>\n",
       "      <td>15602.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>0114885</td>\n",
       "      <td>31357.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0113041</td>\n",
       "      <td>11862.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId   imdbId   tmdbId\n",
       "0        1  0114709    862.0\n",
       "1        2  0113497   8844.0\n",
       "2        3  0113228  15602.0\n",
       "3        4  0114885  31357.0\n",
       "4        5  0113041  11862.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "links.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "92fdde95",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2048,)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "poster_feature['0105812'].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9747318a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Row layout: [movieId | 2048 poster features | 1024 text features].\n",
    "POSTER_DIM, TEXT_DIM = 2048, 1024\n",
    "TEXT_START = 1 + POSTER_DIM\n",
    "feature_array = np.zeros((len(links), TEXT_START + TEXT_DIM))\n",
    "\n",
    "# Walk the rows by position so the write index never depends on the DataFrame's index labels,\n",
    "# and read only the two id columns instead of building a Series per row with iterrows().\n",
    "for pos, (movie_id, imdb_id) in enumerate(zip(links['movieId'], links['imdbId'])):\n",
    "    feature_array[pos, 0] = movie_id\n",
    "    # A movie missing from either dictionary keeps zeros for that modality.\n",
    "    if imdb_id in poster_feature:\n",
    "        feature_array[pos, 1:TEXT_START] = poster_feature[imdb_id]\n",
    "    if movie_id in text_feature:\n",
    "        feature_array[pos, TEXT_START:] = text_feature[movie_id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1e35af39",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column-name -> dtype mapping: int64 for the id, float32 for every feature column.\n",
    "# NOTE(review): no visible cell applies this mapping to feature_df -- confirm whether a cast is still wanted.\n",
    "feature_names = ['poster_feature_%d' % i for i in range(2048)]\n",
    "feature_names += ['text_feature_%d' % i for i in range(1024)]\n",
    "\n",
    "dtype = {'movieId': np.int64}\n",
    "for name in feature_names:\n",
    "    dtype[name] = np.float32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e233c110",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3073"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "19be2061",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names follow the array layout: id first, then poster features, then text features.\n",
    "poster_columns = ['poster_feature_%d' % i for i in range(2048)]\n",
    "text_columns = ['text_feature_%d' % i for i in range(1024)]\n",
    "feature_df = pd.DataFrame(feature_array, columns=['movieId'] + poster_columns + text_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c0fee60f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>poster_feature_0</th>\n",
       "      <th>poster_feature_1</th>\n",
       "      <th>poster_feature_2</th>\n",
       "      <th>poster_feature_3</th>\n",
       "      <th>poster_feature_4</th>\n",
       "      <th>poster_feature_5</th>\n",
       "      <th>poster_feature_6</th>\n",
       "      <th>poster_feature_7</th>\n",
       "      <th>poster_feature_8</th>\n",
       "      <th>...</th>\n",
       "      <th>text_feature_1014</th>\n",
       "      <th>text_feature_1015</th>\n",
       "      <th>text_feature_1016</th>\n",
       "      <th>text_feature_1017</th>\n",
       "      <th>text_feature_1018</th>\n",
       "      <th>text_feature_1019</th>\n",
       "      <th>text_feature_1020</th>\n",
       "      <th>text_feature_1021</th>\n",
       "      <th>text_feature_1022</th>\n",
       "      <th>text_feature_1023</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.088281</td>\n",
       "      <td>0.036760</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.006470</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.023553</td>\n",
       "      <td>0.000163</td>\n",
       "      <td>0.238797</td>\n",
       "      <td>...</td>\n",
       "      <td>0.291230</td>\n",
       "      <td>-0.197272</td>\n",
       "      <td>0.024294</td>\n",
       "      <td>1.307049</td>\n",
       "      <td>-0.789571</td>\n",
       "      <td>0.084938</td>\n",
       "      <td>-0.187339</td>\n",
       "      <td>0.061683</td>\n",
       "      <td>0.183281</td>\n",
       "      <td>-0.356245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.289105</td>\n",
       "      <td>0.134672</td>\n",
       "      <td>0.691380</td>\n",
       "      <td>0.045417</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.051422</td>\n",
       "      <td>...</td>\n",
       "      <td>0.203168</td>\n",
       "      <td>-0.617449</td>\n",
       "      <td>0.443821</td>\n",
       "      <td>1.501953</td>\n",
       "      <td>-0.736949</td>\n",
       "      <td>0.180542</td>\n",
       "      <td>-0.313696</td>\n",
       "      <td>0.274087</td>\n",
       "      <td>0.153105</td>\n",
       "      <td>-0.218745</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.187553</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.904370</td>\n",
       "      <td>0.069441</td>\n",
       "      <td>0.026665</td>\n",
       "      <td>0.817211</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.125072</td>\n",
       "      <td>...</td>\n",
       "      <td>0.173140</td>\n",
       "      <td>-0.209240</td>\n",
       "      <td>0.451933</td>\n",
       "      <td>1.491917</td>\n",
       "      <td>-0.743956</td>\n",
       "      <td>-0.069061</td>\n",
       "      <td>-0.900011</td>\n",
       "      <td>0.583347</td>\n",
       "      <td>0.192817</td>\n",
       "      <td>0.224088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.0</td>\n",
       "      <td>0.182279</td>\n",
       "      <td>0.014646</td>\n",
       "      <td>0.004135</td>\n",
       "      <td>0.197796</td>\n",
       "      <td>0.077938</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.215127</td>\n",
       "      <td>0.021160</td>\n",
       "      <td>0.023108</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.394012</td>\n",
       "      <td>0.679462</td>\n",
       "      <td>1.225475</td>\n",
       "      <td>1.196255</td>\n",
       "      <td>-0.169627</td>\n",
       "      <td>-0.008575</td>\n",
       "      <td>-0.172138</td>\n",
       "      <td>0.114755</td>\n",
       "      <td>-0.127861</td>\n",
       "      <td>-0.003679</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.082123</td>\n",
       "      <td>0.447287</td>\n",
       "      <td>0.002375</td>\n",
       "      <td>0.135956</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.989514</td>\n",
       "      <td>0.808180</td>\n",
       "      <td>0.317510</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.176658</td>\n",
       "      <td>-0.078992</td>\n",
       "      <td>0.726118</td>\n",
       "      <td>1.017430</td>\n",
       "      <td>-0.249834</td>\n",
       "      <td>0.183357</td>\n",
       "      <td>-0.071451</td>\n",
       "      <td>0.644567</td>\n",
       "      <td>0.090399</td>\n",
       "      <td>-1.147284</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 3073 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId  poster_feature_0  poster_feature_1  poster_feature_2  \\\n",
       "0      1.0          0.000000          0.088281          0.036760   \n",
       "1      2.0          0.000000          0.000000          0.000000   \n",
       "2      3.0          0.000000          0.187553          0.000000   \n",
       "3      4.0          0.182279          0.014646          0.004135   \n",
       "4      5.0          0.000000          0.082123          0.447287   \n",
       "\n",
       "   poster_feature_3  poster_feature_4  poster_feature_5  poster_feature_6  \\\n",
       "0          0.000000          0.006470          0.000000          0.023553   \n",
       "1          0.289105          0.134672          0.691380          0.045417   \n",
       "2          0.904370          0.069441          0.026665          0.817211   \n",
       "3          0.197796          0.077938          0.000000          0.215127   \n",
       "4          0.002375          0.135956          0.000000          0.989514   \n",
       "\n",
       "   poster_feature_7  poster_feature_8  ...  text_feature_1014  \\\n",
       "0          0.000163          0.238797  ...           0.291230   \n",
       "1          0.000000          0.051422  ...           0.203168   \n",
       "2          0.000000          0.125072  ...           0.173140   \n",
       "3          0.021160          0.023108  ...          -0.394012   \n",
       "4          0.808180          0.317510  ...          -0.176658   \n",
       "\n",
       "   text_feature_1015  text_feature_1016  text_feature_1017  text_feature_1018  \\\n",
       "0          -0.197272           0.024294           1.307049          -0.789571   \n",
       "1          -0.617449           0.443821           1.501953          -0.736949   \n",
       "2          -0.209240           0.451933           1.491917          -0.743956   \n",
       "3           0.679462           1.225475           1.196255          -0.169627   \n",
       "4          -0.078992           0.726118           1.017430          -0.249834   \n",
       "\n",
       "   text_feature_1019  text_feature_1020  text_feature_1021  text_feature_1022  \\\n",
       "0           0.084938          -0.187339           0.061683           0.183281   \n",
       "1           0.180542          -0.313696           0.274087           0.153105   \n",
       "2          -0.069061          -0.900011           0.583347           0.192817   \n",
       "3          -0.008575          -0.172138           0.114755          -0.127861   \n",
       "4           0.183357          -0.071451           0.644567           0.090399   \n",
       "\n",
       "   text_feature_1023  \n",
       "0          -0.356245  \n",
       "1          -0.218745  \n",
       "2           0.224088  \n",
       "3          -0.003679  \n",
       "4          -1.147284  \n",
       "\n",
       "[5 rows x 3073 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "91d28970",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(62423, 3073)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "6675c618",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
      "Requirement already satisfied: pyarrow in /usr/local/lib/python3.8/dist-packages (1.0.1)\n",
      "Requirement already satisfied: numpy>=1.14 in /usr/local/lib/python3.8/dist-packages (from pyarrow) (1.20.3)\n",
      "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.1.2 is available.\n",
      "You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# pyarrow is a Parquet engine that `DataFrame.to_parquet` can use.\n",
    "# %pip installs into the environment of the running kernel, unlike a bare `!pip`.\n",
    "%pip install pyarrow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "74c262b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_df.to_parquet('feature_df.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9018c20",
   "metadata": {},
   "source": [
    "\n",
    "## Synthetic data\n",
    "\n",
    "If you have not extracted image and text features from real data, proceed with this section to create synthetic features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3be9162e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read imdbId as text so it is not parsed as a number.\n",
    "links = pd.read_csv(\n",
    "    \"./data/ml-25m/links.csv\",\n",
    "    dtype={\"imdbId\": str},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0f1607db",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# One row per movie in links: 1 id column + 2048 poster features + 1024 text features.\n",
    "# A fixed seed makes the synthetic feature store identical on every run.\n",
    "rng = np.random.default_rng(42)\n",
    "feature_array = rng.random((links.shape[0], 1 + 2048 + 1024))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ceaf1f84",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrite the random values in column 0 with the movie ids from links.\n",
    "movie_ids = links['movieId'].to_numpy()\n",
    "feature_array[:, 0] = movie_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5303b942",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names follow the array layout: id first, then poster features, then text features.\n",
    "poster_columns = ['poster_feature_%d' % i for i in range(2048)]\n",
    "text_columns = ['text_feature_%d' % i for i in range(1024)]\n",
    "feature_df = pd.DataFrame(feature_array, columns=['movieId'] + poster_columns + text_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "581fe177",
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_df.to_parquet('feature_df.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "35377d32",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>poster_feature_0</th>\n",
       "      <th>poster_feature_1</th>\n",
       "      <th>poster_feature_2</th>\n",
       "      <th>poster_feature_3</th>\n",
       "      <th>poster_feature_4</th>\n",
       "      <th>poster_feature_5</th>\n",
       "      <th>poster_feature_6</th>\n",
       "      <th>poster_feature_7</th>\n",
       "      <th>poster_feature_8</th>\n",
       "      <th>...</th>\n",
       "      <th>text_feature_1014</th>\n",
       "      <th>text_feature_1015</th>\n",
       "      <th>text_feature_1016</th>\n",
       "      <th>text_feature_1017</th>\n",
       "      <th>text_feature_1018</th>\n",
       "      <th>text_feature_1019</th>\n",
       "      <th>text_feature_1020</th>\n",
       "      <th>text_feature_1021</th>\n",
       "      <th>text_feature_1022</th>\n",
       "      <th>text_feature_1023</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.026260</td>\n",
       "      <td>0.857608</td>\n",
       "      <td>0.410247</td>\n",
       "      <td>0.066654</td>\n",
       "      <td>0.382803</td>\n",
       "      <td>0.899998</td>\n",
       "      <td>0.511562</td>\n",
       "      <td>0.592291</td>\n",
       "      <td>0.565434</td>\n",
       "      <td>...</td>\n",
       "      <td>0.636716</td>\n",
       "      <td>0.578369</td>\n",
       "      <td>0.996169</td>\n",
       "      <td>0.402107</td>\n",
       "      <td>0.412318</td>\n",
       "      <td>0.859952</td>\n",
       "      <td>0.293852</td>\n",
       "      <td>0.341114</td>\n",
       "      <td>0.727113</td>\n",
       "      <td>0.085829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.0</td>\n",
       "      <td>0.141265</td>\n",
       "      <td>0.721758</td>\n",
       "      <td>0.679958</td>\n",
       "      <td>0.955634</td>\n",
       "      <td>0.391091</td>\n",
       "      <td>0.324611</td>\n",
       "      <td>0.505211</td>\n",
       "      <td>0.258331</td>\n",
       "      <td>0.048264</td>\n",
       "      <td>...</td>\n",
       "      <td>0.161505</td>\n",
       "      <td>0.431864</td>\n",
       "      <td>0.836532</td>\n",
       "      <td>0.525013</td>\n",
       "      <td>0.654566</td>\n",
       "      <td>0.823841</td>\n",
       "      <td>0.818313</td>\n",
       "      <td>0.856280</td>\n",
       "      <td>0.638048</td>\n",
       "      <td>0.685537</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>0.119418</td>\n",
       "      <td>0.911146</td>\n",
       "      <td>0.470762</td>\n",
       "      <td>0.762258</td>\n",
       "      <td>0.626335</td>\n",
       "      <td>0.768947</td>\n",
       "      <td>0.241833</td>\n",
       "      <td>0.775992</td>\n",
       "      <td>0.236340</td>\n",
       "      <td>...</td>\n",
       "      <td>0.865548</td>\n",
       "      <td>0.387806</td>\n",
       "      <td>0.668321</td>\n",
       "      <td>0.552122</td>\n",
       "      <td>0.750238</td>\n",
       "      <td>0.863707</td>\n",
       "      <td>0.382173</td>\n",
       "      <td>0.894487</td>\n",
       "      <td>0.565142</td>\n",
       "      <td>0.164083</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.0</td>\n",
       "      <td>0.538184</td>\n",
       "      <td>0.980678</td>\n",
       "      <td>0.643513</td>\n",
       "      <td>0.928519</td>\n",
       "      <td>0.794906</td>\n",
       "      <td>0.201022</td>\n",
       "      <td>0.744666</td>\n",
       "      <td>0.962188</td>\n",
       "      <td>0.915320</td>\n",
       "      <td>...</td>\n",
       "      <td>0.777534</td>\n",
       "      <td>0.904200</td>\n",
       "      <td>0.167337</td>\n",
       "      <td>0.875194</td>\n",
       "      <td>0.180481</td>\n",
       "      <td>0.815904</td>\n",
       "      <td>0.808288</td>\n",
       "      <td>0.036711</td>\n",
       "      <td>0.902779</td>\n",
       "      <td>0.580946</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>0.772951</td>\n",
       "      <td>0.239788</td>\n",
       "      <td>0.061874</td>\n",
       "      <td>0.162997</td>\n",
       "      <td>0.388310</td>\n",
       "      <td>0.236311</td>\n",
       "      <td>0.162757</td>\n",
       "      <td>0.207134</td>\n",
       "      <td>0.111078</td>\n",
       "      <td>...</td>\n",
       "      <td>0.250022</td>\n",
       "      <td>0.335043</td>\n",
       "      <td>0.091674</td>\n",
       "      <td>0.121507</td>\n",
       "      <td>0.418124</td>\n",
       "      <td>0.150020</td>\n",
       "      <td>0.803506</td>\n",
       "      <td>0.059504</td>\n",
       "      <td>0.002342</td>\n",
       "      <td>0.932321</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 3073 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId  poster_feature_0  poster_feature_1  poster_feature_2  \\\n",
       "0      1.0          0.026260          0.857608          0.410247   \n",
       "1      2.0          0.141265          0.721758          0.679958   \n",
       "2      3.0          0.119418          0.911146          0.470762   \n",
       "3      4.0          0.538184          0.980678          0.643513   \n",
       "4      5.0          0.772951          0.239788          0.061874   \n",
       "\n",
       "   poster_feature_3  poster_feature_4  poster_feature_5  poster_feature_6  \\\n",
       "0          0.066654          0.382803          0.899998          0.511562   \n",
       "1          0.955634          0.391091          0.324611          0.505211   \n",
       "2          0.762258          0.626335          0.768947          0.241833   \n",
       "3          0.928519          0.794906          0.201022          0.744666   \n",
       "4          0.162997          0.388310          0.236311          0.162757   \n",
       "\n",
       "   poster_feature_7  poster_feature_8  ...  text_feature_1014  \\\n",
       "0          0.592291          0.565434  ...           0.636716   \n",
       "1          0.258331          0.048264  ...           0.161505   \n",
       "2          0.775992          0.236340  ...           0.865548   \n",
       "3          0.962188          0.915320  ...           0.777534   \n",
       "4          0.207134          0.111078  ...           0.250022   \n",
       "\n",
       "   text_feature_1015  text_feature_1016  text_feature_1017  text_feature_1018  \\\n",
       "0           0.578369           0.996169           0.402107           0.412318   \n",
       "1           0.431864           0.836532           0.525013           0.654566   \n",
       "2           0.387806           0.668321           0.552122           0.750238   \n",
       "3           0.904200           0.167337           0.875194           0.180481   \n",
       "4           0.335043           0.091674           0.121507           0.418124   \n",
       "\n",
       "   text_feature_1019  text_feature_1020  text_feature_1021  text_feature_1022  \\\n",
       "0           0.859952           0.293852           0.341114           0.727113   \n",
       "1           0.823841           0.818313           0.856280           0.638048   \n",
       "2           0.863707           0.382173           0.894487           0.565142   \n",
       "3           0.815904           0.808288           0.036711           0.902779   \n",
       "4           0.150020           0.803506           0.059504           0.002342   \n",
       "\n",
       "   text_feature_1023  \n",
       "0           0.085829  \n",
       "1           0.685537  \n",
       "2           0.164083  \n",
       "3           0.580946  \n",
       "4           0.932321  \n",
       "\n",
       "[5 rows x 3073 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "129ab953",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
